#Set As Working Directory
setwd("~/AIT 582/aws-honeypot-attack-data")

#loading required packages 
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(rworldmap)
library(ggthemes)
library(plotly)
library(randomForest)
library(plyr)
library(readr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(knitr)
library(plotrix)
library(MASS)
library(class)
library(ISOweek)
library(stringr)

#Importing the dataset
HP<-read.csv('AWS_Honeypot_marx-geo.csv',sep=',',stringsAsFactors=F)
HP %>% dplyr::filter(latitude>100) %>% dplyr::select(srcstr, country, locale, latitude, longitude) %>% head(10)

#Data Preprocessing 
# cleanup the missing geo locations
HP1<- data.frame(HP %>% filter(!is.na(latitude) & !is.na(longitude)))
# filter the wrong coded latitude
dfc1 <-data.frame(HP1 %>% filter(latitude>100))
dfc2 <-data.frame(HP1 %>% filter(latitude<=100))

# switch the X ad latitude column
dfc1$latitude <- dfc1$X
# re-rbind the subsets
attackloc <- data.frame(rbind(dfc2,dfc1))
# remove the now useless X column
attackloc$X<-NULL

#Setting a Reference data
RES <- data.frame(attackloc %>% 
                    dplyr::group_by(srcstr,country, longitude, latitude, locale) %>% 
                    dplyr::summarize(count=n()) %>% arrange(-count))

# select the top bad IP.adresses and make a new name to identify unique location: IP.adress + location
topattacks<-data.frame(RES %>% top_n(10))
topattacks$fullIP <-paste0(topattacks$srcstr,'(',topattacks$locale,')')
topattacks

#Removing all the unneccesary feilds
RES1 <- data.frame(HP1%>% 
                     dplyr::group_by(datetime,host,src,proto,country, longitude, latitude, locale) %>% 
                     dplyr::summarize(count=n()) %>% arrange(-count))
RES1

RES1 %>% 
  mutate(
    datetime = datetime %>% parse_date_time(order = "%m/%d/%y %H:%M"),
    host= host %>% as.factor, 
    proto = proto %>% as.factor, 
    country = country %>% as.factor,
    lmp_country = country )
head(RES1)
sum(is.na(RES1))

#pie chart for protocols used
fig1 <- plot_ly(HP1,type='pie', labels= ~proto, 
                textinfo='label+percent',
                insidetextorientation='radial')%>% layout(title = 'Name Of Packet Protocal Type used By Attacker')

fig1

#Most attacks for Protocol type.
TableP<-table(HP1$proto)
summary(TableP)
TableP

#hypothesis tests
t.test(HP1$spt,HP1$dpt)
t.test(HP1$latitude,HP1$latitude)

#test to determine factor relationship
chisq.test(HP1$host,HP1$proto)

chisq.test(table(HP1$country))

chisq.test(HP1$country,HP1$proto)

#table for country and time of attack
TableT<-table(HP1$country,HP1$datetime)
head(TableT,10)

#Analyzing which country has the highest effected ports.
HPF<- HP1[-c(1,5,9,11,12,13,14,15,16)]
head(HPF)

summary(HPF)
sum(is.na(HPF))

Destinationport<-HPF[,"dpt"]!=0
Sourceport<-HPF[,"spt"]!=0
Ports<-!Destinationport & !Sourceport
Ports

#displaying high ports presence for countries
CountryNames<-HPF[,7]
CountryNames
CountryN<-sub("","_",CountryNames)
names(Ports)<-CountryN
PortD<-sapply(Ports,function(x) if(1) "Ports" else "Port Not Available")
PortD
max(HPF$country)

# Geographical location of the attacks 
#Displaying the attacked areas with colour brewer and map data.
histo<-ggplotGrob(
  topattacks %>% ggplot(aes(x=reorder(fullIP,count),y=count)) + 
    geom_bar(stat='identity') + coord_flip() + theme_fivethirtyeight() + 
    theme(axis.text=element_text(size=5)) + labs(subtitle='top 10 bad IP addresses'))
histo

countries_map <-map_data("world")
world_map<-ggplot() + 
  geom_map(data = countries_map, map = countries_map,aes(x = long, y = lat, map_id = region, group = group), fill = "green", color = "white", size = 0.1) + 
  theme_minimal() + 
  theme(axis.text=element_blank())

world_map + 
  geom_point(data=RES,aes(x=longitude,y=latitude,size=count,color=count),alpha=.7) +
  scale_color_gradient2(name='',low = "#AA4371", mid = "blue", high = "orange") + 
  guides(color=FALSE,size=F) + 
  scale_radius(range=c(1,20)) + 
  labs(title=' Attackers IP Address Locations Identified by AWS Honey pot',
       subtitle=' April-2013 to August-2013',x='longitude',y='latitude') + 
  annotation_custom(grob = histo, xmin = 80, xmax = 210, ymin = -100, ymax = -40)
summary(RES1)

# Returning timedate into splits of time,day,month and year
attackloc$month<-sapply(attackloc$datetime,function(x) as.numeric(strsplit(strsplit(x,' ')[[1]][1],'/')[[1]][1]))
attackloc$day<-sapply(attackloc$datetime,function(x) as.numeric(strsplit(strsplit(x,' ')[[1]][1],'/')[[1]][2]))
attackloc$year<-2000 + sapply(attackloc$datetime,function(x) as.numeric(strsplit(strsplit(x,' ')[[1]][1],'/')[[1]][3]))
attackloc$hour<-sapply(attackloc$datetime,function(x) as.numeric(strsplit(strsplit(x,' ')[[1]][2],':')[[1]][1]))
attackloc$min<-sapply(attackloc$datetime,function(x) as.numeric(strsplit(strsplit(x,' ')[[1]][2],':')[[1]][2]))
attackloc$DateTS<-as.POSIXct(
  paste0(attackloc$year,'-',
         attackloc$month,'-',
         attackloc$day,' ',
         attackloc$hour,':',
         attackloc$min,':00'),format= "%Y-%m-%d %H:%M:%S")

topIPaddress<-topattacks$srcstr
attackers<- data.frame(attackloc%>% dplyr::filter(srcstr %in% topIPaddress))
attackers
max(attackers$srcstr)
max(attackers$src )
max(attackers$country )
max(attackers$locale )
max(attackers$datetime )
max(attackers$host )
max(attackers$proto )

#Displaying some of the maximum Observations in each feild
cat("The main attacked Source IPaddress and packet are",max(attackers$srcstr), "and", max(attackers$src ))
cat("The main attacked Country and locale are",max(attackers$country), "and", max(attackers$locale ))
cat("The main attacked Host and Protocal used are",max(attackers$host ), "and", max(attackers$proto ))

#Time analysis on Top attackers IP addresses
lims <- as.POSIXct(strptime(c("2013-03-01 00:00:00","2013-10-01 23:59:59"), format = "%Y-%m-%d %H:%M:%S"))
attackers %>% 
  dplyr::select(year, month, day, srcstr) %>% 
  mutate(dd = as.POSIXct(as.Date(paste0(year,'-',month,'-',day), format= "%Y-%m-%d"))) %>%
  dplyr::group_by(srcstr,dd) %>% 
  dplyr::summarize(count=n()) %>% 
  ggplot(aes(x=dd,y=count,group=1)) + geom_histogram(stat='identity',aes(group=1)) + 
  theme_gray() + labs(title= 'Month and Day wise Top Attackers IPaddress ',x='Time Period',y='Attacks Count')+
  scale_x_datetime(limits =lims) + facet_wrap(~srcstr, ncol=2, scales='free')

#Additional Visuals on honeypot host to Detect the attack trends
attackloc %>% dplyr::select(year, month, day, host)%>% 
  mutate(dd = as.POSIXct(as.Date(paste0(year,'-',month,'-',day), format= "%Y-%m-%d"))) %>%
  dplyr::group_by(host,dd)%>% 
  dplyr::summarize(count=n()) %>% 
  ggplot(aes(x=dd,y=count,group=host,fill=host)) + geom_histogram(stat='identity',aes(group=host)) + 
  theme_dark() + facet_wrap(~host,scales='free',ncol=2) +
  theme(legend.position='left',legend.direction='vertical',strip.text.x = element_text(size=0)) +
  labs(title='Detection by Attacks on Hosts',x='Time Period',y='Attacks Recorded',legend='Host Names')

#list of Unique observations 
unique(RES1$country)
unique(RES1$host)
unique(RES1$proto)
unique(RES1$locale)

RES1$full_ip<- paste(RES1$country,RES1$locale,RES1$src,sep="-")
RES1$full_ip_t<-paste(RES1$country,RES1$locale,RES1$src,RES1$datetime,sep="-")

#Some Unique Observations in the data.
Uniquehost<- unique((RES1$host))
uniqueScr<- unique(RES1$full_ip_t)
head(Uniquehost)
sample(uniqueScr,20)

mod2 <- glm(data=RES1,count ~ country + locale + proto+ host )

#summary
summary(mod2)

#significance and confidence level of model fit
sigma(mod2)*10/mean(RES1$count)
confint.lm(mod2)

#Train data set for prediction
#Split data into train and test set
set.seed(1)
train <-sample(0.05:length(RES1$count), length(RES1$count)*0.05)
test <- sample(0.1:length(RES1$count), length(RES1$count)*0.1)

RES1.train <- RES1[train,]
RES1.test <- RES1[test,]
str(RES1)
unique(RES1)
dim(RES1)
dim(RES1.test)
dim(RES1.train)
summary(RES1.test)
summary(RES1.train)
#Visualizing location(country and locale) wise source and time.
str(RES1$full_ip)
str(RES1$full_ip_t)

#Logistic Regression
bestmodel<-glm(count~ host+src+proto,data=RES1.train,family = binomial)
#best fit
mylogit.probs1<-predict(bestfit,validation,type="response")
mylogit.pred2[mylogit.probs1 >0.5] = "most attacks"
table(mylogit.pred2, RES1.train$count)
#accuracy 73.90%
#We Found that in the initial fit the model had accuracy below 30 and after the prediction it is seen 
#that most of the attackers use common IP feilds for frequent attacks on host from a specific location.
#The probability of single attacks is also high as there are many hackers.
#we have fit a best statistic analysis and data exploration from initial data not providing any results 
#to unique observances.

#R couldnot not run our RF observations due to the memory and dataset comprise 
#So we could only fit logistic regression and linear model in R
#We made Use of Knime tool for Random Forest and Classification and found nearly 500 distint trees.
#The highest accuracy using R was found to be 81%.